In [1]:
#pip install tensorflow
In [2]:
#pip install matplotlib
In [3]:
#pip install climetlab
In [4]:
#pip install climetlab-eumetnet-postprocessing-benchmark
In [5]:
#pip install --upgrade jupyterlab ipywidgets
In [6]:
#pip install requests
In [7]:
#pip install tensorrt
In [8]:
#pip install plotly
In [9]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import joblib # For saving scaler model
import os
import pickle
import plotly.graph_objects as go
import pandas as pd
import climetlab as cml
from datetime import datetime
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, ConvLSTM2D, Flatten
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import ConvLSTM2D, Flatten, Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.regularizers import l2
from keras.callbacks import ModelCheckpoint
2024-04-29 20:43:17.446840: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. 2024-04-29 20:43:18.827261: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
In [10]:
def fetch_data(start_date = '2018-01-01T00', end_date='2024-01-01T00'):
url_template = ("https://dataset.api.hub.geosphere.at/v1/timeseries/historical/inca-v1-1h-1km?"
"parameters=RR¶meters=GL¶meters=P0¶meters=T2M¶meters=TD2M&"
"parameters=RH2M¶meters=UU¶meters=VV&start={start_date}%3A00&"
"end={end_date}%3A00&lat_lon=48.333056%2C16.631944&"
"lat_lon=48.115278%2C16.175833&output_format=geojson&"
"filename=INCA_Vienna_timeseries_all_parameters_{start_year}_{end_year}")
formatted_url = url_template.format(start_date=start_date, end_date=end_date,
start_year=start_date[:4], end_year=end_date[:4])
print(formatted_url)
response = requests.get(formatted_url)
if response.status_code == 200:
data = response.json()
print("Fetched data successfully")
else:
print("Failed to retrieve data", response.status_code)
# Processing the data
data_list = []
for feature in data['features']:
location = feature['geometry']['coordinates']
for i, timestamp in enumerate(data['timestamps']):
record = {
'time': pd.to_datetime(timestamp),
'longitude': location[0],
'latitude': location[1],
'RR': feature['properties']['parameters']['RR']['data'][i],
'T2M': feature['properties']['parameters']['T2M']['data'][i],
'RH2M': feature['properties']['parameters']['RH2M']['data'][i],
'TD2M': feature['properties']['parameters']['TD2M']['data'][i],
'GL': feature['properties']['parameters']['GL']['data'][i],
'UU': feature['properties']['parameters']['UU']['data'][i],
'VV': feature['properties']['parameters']['VV']['data'][i],
'P0': feature['properties']['parameters']['P0']['data'][i]
}
data_list.append(record)
original_df = pd.DataFrame(data_list)
return original_df
In [11]:
def calculate_rolling_features(df, window_size, feature_name):
if len(df) < window_size:
window_size = len(df) # Adjust window size to length of the data if data is too short
df[f'{feature_name}_rolling_mean'] = df['T2M'].rolling(window=window_size).mean()
df[f'{feature_name}_rolling_max'] = df['T2M'].rolling(window=window_size).max()
df[f'{feature_name}_rolling_min'] = df['T2M'].rolling(window=window_size).min()
df[f'{feature_name}_rolling_std'] = df['T2M'].rolling(window=window_size).std()
def calculate_ewm_features(df, span, feature_name):
df[f'{feature_name}_ewm'] = df['T2M'].ewm(span=span, adjust=False).mean()
def handle_na(df):
df.fillna(method='ffill', inplace=True) # Forward fill first to propagate last valid observation forward
df.fillna(method='bfill', inplace=True) # Backward fill to ensure no NaNs remain
def feature_engineering(df):
df.dropna(inplace=True) # Drop rows with NaN values first
original_time_index = df['time'].copy()
# Basic date-time features
df['hour'] = df['time'].dt.hour
df['day_of_year'] = df['time'].dt.dayofyear
df['week_of_year'] = df['time'].dt.isocalendar().week
df['day_of_month'] = df['time'].dt.day
df['month'] = df['time'].dt.month
df['year'] = df['time'].dt.year
# Wind and interaction features
df['wind_speed'] = np.sqrt(df['UU']**2 + df['VV']**2)
df['wind_direction'] = np.arctan2(df['VV'], df['UU']) * 180 / np.pi
df['temp_humidity_interaction'] = df['T2M'] * df['RH2M']
df['wind_rain_interaction'] = df['wind_speed'] * df['RR']
df['RR_lag1'] = df['RR'].shift(1)
df['T2M_lag1'] = df['T2M'].shift(1)
# Cyclical time features
df['hour_sin'] = np.sin(2 * np.pi * df['hour']/24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour']/24)
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
df['day_sin'] = np.sin(2 * np.pi * df['day_of_year']/365)
df['day_cos'] = np.cos(2 * np.pi * df['day_of_year']/365)
# Rolling and EWMA features
#calculate_rolling_features(df, 6, '6h') # 24-hour window
#calculate_rolling_features(df, 12, '12h') # 24-hour window
calculate_rolling_features(df, 24, '24h') # 24-hour window
#calculate_rolling_features(df, 24*7, '7d') # 7-day window
calculate_rolling_features(df, 24*365, '365d') # 7-day window
#calculate_ewm_features(df, 6, '6h') # EWMA for 24 hours
#calculate_ewm_features(df, 12, '12h') # EWMA for 24 hours
calculate_ewm_features(df, 24, '24h') # EWMA for 24 hours
#calculate_ewm_features(df, 24*7, '7d') # EWMA for 7 days
calculate_ewm_features(df, 24*365, '365d') # EWMA for 7 days
# Handle NaNs intelligently
handle_na(df)
# Ensure only numeric types are used for modeling
df = df.select_dtypes(include=[np.number])
return df, original_time_index
In [12]:
def get_data_for_convlstm(start_date, end_date):
df, time_index = feature_engineering(fetch_data(start_date, end_date).copy())
return df, time_index
In [13]:
def create_sequences(input_data, param, sequence_length, feature_count, hours_to_predict, include_target=True):
X, y = [], []
for i in range(len(input_data) - sequence_length - hours_to_predict + 1): # Adjusted for 24 hours ahead
X.append(input_data.iloc[i:(i + sequence_length)].values)
if include_target:
y.append(input_data.iloc[i + sequence_length:i + sequence_length + hours_to_predict][param].values) # 24 hours target
if include_target:
return np.array(X), np.array(y)
else:
return np.array(X) # Return only X if include_target is False
In [14]:
preprocessed_df = fetch_data()
https://dataset.api.hub.geosphere.at/v1/timeseries/historical/inca-v1-1h-1km?parameters=RR¶meters=GL¶meters=P0¶meters=T2M¶meters=TD2M¶meters=RH2M¶meters=UU¶meters=VV&start=2018-01-01T00%3A00&end=2024-01-01T00%3A00&lat_lon=48.333056%2C16.631944&lat_lon=48.115278%2C16.175833&output_format=geojson&filename=INCA_Vienna_timeseries_all_parameters_2018_2024 Fetched data successfully
In [15]:
preprocessed_df_copy = preprocessed_df.copy()
In [16]:
feature_df, time_index = feature_engineering(preprocessed_df)
/tmp/ipykernel_38954/935309804.py:13: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead. df.fillna(method='ffill', inplace=True) # Forward fill first to propagate last valid observation forward /tmp/ipykernel_38954/935309804.py:14: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead. df.fillna(method='bfill', inplace=True) # Backward fill to ensure no NaNs remain
In [17]:
feature_df_copy = feature_df.copy()
In [18]:
df = feature_df.copy()
df
Out[18]:
| longitude | latitude | RR | T2M | RH2M | TD2M | GL | UU | VV | P0 | ... | 24h_rolling_mean | 24h_rolling_max | 24h_rolling_min | 24h_rolling_std | 365d_rolling_mean | 365d_rolling_max | 365d_rolling_min | 365d_rolling_std | 24h_ewm | 365d_ewm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 16.638773 | 48.334339 | 0.000 | 1.68 | 94.58 | 0.91 | 0.0 | -1.08 | 1.06 | 101026.92 | ... | 3.458333 | 7.97 | 0.40 | 2.713545 | 12.067063 | 34.98 | -13.05 | 9.608740 | 1.680000 | 1.680000 |
| 1 | 16.638773 | 48.334339 | 0.000 | 1.17 | 96.23 | 0.64 | 0.0 | -2.09 | 0.80 | 100967.53 | ... | 3.458333 | 7.97 | 0.40 | 2.713545 | 12.067063 | 34.98 | -13.05 | 9.608740 | 1.639200 | 1.679884 |
| 2 | 16.638773 | 48.334339 | 0.000 | 0.40 | 99.42 | 0.32 | 0.0 | 0.27 | 0.29 | 100957.14 | ... | 3.458333 | 7.97 | 0.40 | 2.713545 | 12.067063 | 34.98 | -13.05 | 9.608740 | 1.540064 | 1.679591 |
| 3 | 16.638773 | 48.334339 | 0.000 | 0.51 | 100.00 | 0.51 | 0.0 | 0.61 | 0.21 | 100949.28 | ... | 3.458333 | 7.97 | 0.40 | 2.713545 | 12.067063 | 34.98 | -13.05 | 9.608740 | 1.457659 | 1.679324 |
| 4 | 16.638773 | 48.334339 | 0.000 | 0.49 | 100.00 | 0.49 | 0.0 | 0.76 | 0.43 | 100939.71 | ... | 3.458333 | 7.97 | 0.40 | 2.713545 | 12.067063 | 34.98 | -13.05 | 9.608740 | 1.380246 | 1.679053 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 105165 | 16.181034 | 48.112373 | 0.000 | 2.26 | 86.19 | 0.20 | 0.0 | 0.19 | 0.19 | 100935.99 | ... | 1.450000 | 3.45 | -0.91 | 1.274571 | 11.459591 | 33.79 | -10.14 | 8.366507 | 2.314958 | 11.471637 |
| 105166 | 16.181034 | 48.112373 | 0.000 | 2.51 | 82.91 | -0.09 | 0.0 | 1.08 | 0.49 | 100932.03 | ... | 1.420417 | 3.45 | -0.91 | 1.239458 | 11.459307 | 33.79 | -10.14 | 8.366769 | 2.330561 | 11.469591 |
| 105167 | 16.181034 | 48.112373 | 0.000 | 1.84 | 84.37 | -0.51 | 0.0 | 0.20 | 0.17 | 101026.68 | ... | 1.353333 | 3.12 | -0.91 | 1.166241 | 11.458772 | 33.79 | -10.14 | 8.367234 | 2.291316 | 11.467392 |
| 105168 | 16.181034 | 48.112373 | 0.000 | 4.47 | 78.51 | 1.07 | 0.0 | 4.03 | 1.66 | 100941.54 | ... | 1.409583 | 4.47 | -0.91 | 1.281970 | 11.458508 | 33.79 | -10.14 | 8.367418 | 2.465611 | 11.465795 |
| 105169 | 16.181034 | 48.112373 | 0.002 | 4.82 | 82.31 | 2.06 | 0.0 | 1.68 | 0.81 | 101031.94 | ... | 1.500833 | 4.82 | -0.91 | 1.440727 | 11.458150 | 33.79 | -10.14 | 8.367635 | 2.653962 | 11.464278 |
105110 rows × 38 columns
In [19]:
param = 'T2M'
sequence_length = 24
feature_count = df.shape[1]
hours_to_predict = 6
print("Number of features:", feature_count)
Number of features: 38
In [20]:
scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(df)
df_scaled = pd.DataFrame(df_scaled, columns=df.columns)
validation_split=0.2
train_idx = int(len(df_scaled) * (1 - validation_split))
train_df = df_scaled.iloc[:train_idx]
test_df = df_scaled.iloc[train_idx:]
X_train, y_train = create_sequences(train_df, param, sequence_length, feature_count, hours_to_predict)
X_test, y_test = create_sequences(test_df, param, sequence_length, feature_count, hours_to_predict)
#X_test = X_test.reshape((X_test.shape[0], sequence_length, 1, 1, X_train.shape[2]))
#y_test = y_test
# Reshape for ConvLSTM
X_train = X_train.reshape((X_train.shape[0], sequence_length, 1, 1, X_train.shape[2]))
X_test = X_test.reshape((X_test.shape[0], sequence_length, 1, 1, X_test.shape[2]))
In [21]:
model = Sequential([
ConvLSTM2D(filters=64, kernel_size=(1, 1), activation='relu',
input_shape=(sequence_length, 1, 1, feature_count), return_sequences=True),
ConvLSTM2D(filters=32, kernel_size=(1, 1), activation='relu', return_sequences=False),
Flatten(),
Dense(50, activation='relu'),
Dense(hours_to_predict)
])
optimizer = Adam(learning_rate=0.001, clipvalue=1.0)
model.compile(optimizer=optimizer, loss='mse')
2024-04-29 20:44:58.547453: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:282] failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error 2024-04-29 20:44:58.547623: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:134] retrieving CUDA diagnostic information for host: babylap 2024-04-29 20:44:58.547644: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:141] hostname: babylap 2024-04-29 20:44:58.547851: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:165] libcuda reported version is: 535.171.4 2024-04-29 20:44:58.547898: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:169] kernel reported version is: 535.171.4 2024-04-29 20:44:58.547910: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:248] kernel version seems to match DSO: 535.171.4 /home/yjess/anaconda3/lib/python3.11/site-packages/keras/src/layers/rnn/rnn.py:204: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead. super().__init__(**kwargs)
In [22]:
epochs=2
batch_size=32
checkpoint_path = "checkpoints/model_epoch_{epoch:02d}_val_loss_{val_loss:.2f}.weights.h5"
checkpoint = ModelCheckpoint(
filepath=checkpoint_path, save_weights_only=True,
monitor='val_loss', mode='min', save_best_only=True, verbose=1
)
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
validation_data=(X_test, y_test), callbacks=[checkpoint])
Epoch 1/2
2024-04-29 20:44:59.438709: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 306647232 exceeds 10% of free system memory.
2626/2627 ━━━━━━━━━━━━━━━━━━━━ 0s 16ms/step - loss: 0.0078 Epoch 1: val_loss improved from inf to 0.00094, saving model to checkpoints/model_epoch_01_val_loss_0.00.weights.h5 2627/2627 ━━━━━━━━━━━━━━━━━━━━ 50s 18ms/step - loss: 0.0078 - val_loss: 9.3723e-04 Epoch 2/2 2625/2627 ━━━━━━━━━━━━━━━━━━━━ 0s 19ms/step - loss: 9.5452e-04 Epoch 2: val_loss improved from 0.00094 to 0.00084, saving model to checkpoints/model_epoch_02_val_loss_0.00.weights.h5 2627/2627 ━━━━━━━━━━━━━━━━━━━━ 52s 20ms/step - loss: 9.5447e-04 - val_loss: 8.4307e-04
Out[22]:
<keras.src.callbacks.history.History at 0x7f28170cc610>
In [23]:
y_pred = model.predict(X_test).flatten()
y_test = y_test.flatten() # Ensure y_test is flat if it's not already
dummy_array = np.zeros((y_pred.shape[0], feature_count))
dummy_array[:, 3] = y_pred.squeeze()
test_dummy_array = np.zeros((y_test.shape[0], feature_count))
test_dummy_array[:, 3] = y_test.squeeze()
test_y = scaler.inverse_transform(test_dummy_array)[:, 3]
predictions_y = scaler.inverse_transform(dummy_array)[:, 3]
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
print("MSE ", mse, "RMSE ", rmse, "MAE ", mae)
plt.figure(figsize=(15, 6))
plt.subplot(1, 2, 1)
plt.plot(y_test, label='Actual', linestyle='-')
plt.plot(y_pred, label='Predicted', linestyle='--')
plt.title('Detailed View of Predictions vs. Actual')
plt.xlabel('Time')
plt.ylabel('Value')
plt.legend()
# Scatter plot of the actual vs predicted
plt.subplot(1, 2, 2)
plt.scatter(y_test, y_pred, alpha=0.6)
plt.title('Prediction vs Actual')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], 'r--') # Ideal predictions line
plt.tight_layout()
plt.show()
657/657 ━━━━━━━━━━━━━━━━━━━━ 4s 6ms/step MSE 0.0008430658443101905 RMSE 0.0290355961590285 MAE 0.0214544413842388
In [24]:
# Assuming 'get_data_for_convlstm' is defined elsewhere and properly implemented
data_for_prediction, time_index_pre_data = get_data_for_convlstm('2024-04-20T00', '2024-04-28T08')
# Ensure scaler is fitted; if it's the first use, fit on some initial data or the current data
scaler = MinMaxScaler()
scaler.fit(data_for_prediction) # Fit if not already done; else skip
# Transform the data for prediction
df_scaled = scaler.transform(data_for_prediction)
df_scaled = pd.DataFrame(df_scaled, columns=data_for_prediction.columns)
latest_sequence = df_scaled.tail(sequence_length).values.reshape(1, sequence_length, 1, 1, feature_count)
future_temperatures = model.predict(latest_sequence)
dummy_output_array = np.zeros((hours_to_predict, scaler.n_features_in_))
dummy_output_array[:, 3] = future_temperatures.flatten() # Ensure to flatten if necessary
predicted_temps = scaler.inverse_transform(dummy_output_array)[:, 3]
# Timestamp handling
data_for_prediction.index = time_index_pre_data
last_timestamp = data_for_prediction.index[-1]
prediction_frequency = 'H'
new_timestamps = pd.date_range(start=last_timestamp + pd.Timedelta(hours=1), periods=len(predicted_temps), freq=prediction_frequency)
predictions_df = pd.DataFrame(predicted_temps, index=new_timestamps, columns=['Predicted T2M'])
# Fetch and set the actual data for comparison
data_actual_for_prediction, time_index_actual_data = get_data_for_convlstm('2024-04-28T08', '2024-04-29T00')
data_actual_for_prediction.index = time_index_actual_data
# Plotting the predictions alongside the actual data
fig = go.Figure()
fig.add_trace(go.Scatter(x=data_for_prediction.index, y=data_for_prediction['T2M'], mode='lines', name='Original T2M', line=dict(color='blue')))
fig.add_trace(go.Scatter(x=predictions_df.index, y=predictions_df['Predicted T2M'], mode='lines+markers', name='Predicted T2M', line=dict(color='red', dash='dash')))
fig.add_trace(go.Scatter(x=data_actual_for_prediction.index, y=data_actual_for_prediction['T2M'], mode='lines', name='Actual T2M', line=dict(color='green')))
fig.update_layout(
title='Temperature Predictions vs Actuals',
xaxis_title='Date',
yaxis_title='Temperature (°C)',
hovermode="x unified"
)
fig.show()
https://dataset.api.hub.geosphere.at/v1/timeseries/historical/inca-v1-1h-1km?parameters=RR¶meters=GL¶meters=P0¶meters=T2M¶meters=TD2M¶meters=RH2M¶meters=UU¶meters=VV&start=2024-04-20T00%3A00&end=2024-04-28T08%3A00&lat_lon=48.333056%2C16.631944&lat_lon=48.115278%2C16.175833&output_format=geojson&filename=INCA_Vienna_timeseries_all_parameters_2024_2024 Fetched data successfully 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 23ms/step https://dataset.api.hub.geosphere.at/v1/timeseries/historical/inca-v1-1h-1km?parameters=RR¶meters=GL¶meters=P0¶meters=T2M¶meters=TD2M¶meters=RH2M¶meters=UU¶meters=VV&start=2024-04-28T08%3A00&end=2024-04-29T00%3A00&lat_lon=48.333056%2C16.631944&lat_lon=48.115278%2C16.175833&output_format=geojson&filename=INCA_Vienna_timeseries_all_parameters_2024_2024
/tmp/ipykernel_38954/935309804.py:13: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead. df.fillna(method='ffill', inplace=True) # Forward fill first to propagate last valid observation forward /tmp/ipykernel_38954/935309804.py:14: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead. df.fillna(method='bfill', inplace=True) # Backward fill to ensure no NaNs remain
Fetched data successfully
/tmp/ipykernel_38954/935309804.py:13: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead. df.fillna(method='ffill', inplace=True) # Forward fill first to propagate last valid observation forward /tmp/ipykernel_38954/935309804.py:14: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead. df.fillna(method='bfill', inplace=True) # Backward fill to ensure no NaNs remain
In [25]:
formatted_date_time = datetime.now().strftime("%Y%m%d_%H%M%S")
model_folder_name = f"model_folder{formatted_date_time}"
os.makedirs(model_folder_name, exist_ok=True)
model.save(os.path.join(model_folder_name, 'model.keras'))
joblib.dump(scaler, os.path.join(model_folder_name, 'scaler.pkl'))
Out[25]:
['model_folder20240429_204649/scaler.pkl']
In [26]:
loaded_model = tf.keras.models.load_model(os.path.join(model_folder_name, 'model.keras'))
loaded_scaler = joblib.load(os.path.join(model_folder_name, 'scaler.pkl'))
In [ ]:
In [ ]: